Software citations

We used R version 4.4.2 [@base] and the following R packages: AnnotationDbi v. 1.66.0 [@AnnotationDbi], BiocManager v. 1.30.23 [@BiocManager], DESeq2 v. 1.44.0 [@DESeq2], GGally v. 2.2.1 [@GGally], ggcorrplot v. 0.1.4.1 [@ggcorrplot], ggpubr v. 0.6.0 [@ggpubr], ggtext v. 0.1.2 [@ggtext], here v. 1.0.1 [@here], hexbin v. 1.28.3 [@hexbin], knitr v. 1.48 [@knitr2014; @knitr2015; @knitr2024], pheatmap v. 1.0.12 [@pheatmap], renv v. 1.0.7 [@renv], rmarkdown v. 2.27 [@rmarkdown2018; @rmarkdown2020; @rmarkdown2024], scales v. 1.3.0 [@scales], svglite v. 2.1.3 [@svglite], tidyverse v. 2.0.0 [@tidyverse], txdbmaker v. 1.0.1 [@txdbmaker], tximport v. 1.32.0 [@tximport], UpSetR v. 1.4.0 [@UpSetR], VennDiagram v. 1.7.3 [@VennDiagram], viridis v. 0.6.5 [@viridis], vsn v. 3.72.0 [@vsn].

Data import

The RDS objects required for this report can be generated using the data import script.

If the RDS objects have already been created in a previous run, the following cell can be uncommented and run, and the remaining cells in the data import section can be skipped. See RDS object section).

Analyses and figures

dir.create(here("results/figures/"), showWarnings = FALSE)

Summary of annotated genes

Number of genes per species:

gene_counts_txi %>%
  group_by(species) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(total = sum(n), rel.freq = n / total)

Number of genes per species after filtering low counts (n < 10 for all samples):

gene_counts_txi_filtered %>%
  group_by(species) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(total = sum(n), rel.freq = n / total)

Number of gene types per species:

gene_counts_txi %>%
  group_by(species, gene_type) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(total = sum(n), rel.freq = round(n / total, 5))
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.

Number of gene types per species after filtering out low counts (n < 10 for all samples):

gene_counts_txi_filtered %>%
  group_by(species, gene_type) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  mutate(total = sum(n), rel.freq = round(n / total, 5))
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.
n_genes <- dim(gene_counts_txi)[1]
# stopifnot(dim(gene_counts_txi)[1] == (gene_counts_salmon %>% group_by(sample) %>% summarise(n = n()) %>% select(n) %>% unique()))
n_genes_filtered <- dim(gse_filtered)[1]

n_human_genes <- gene_counts_txi %>%
  filter(species == "human") %>%
  count() %>%
  pull()
n_human_genes_filtered <- gene_counts_txi_filtered %>%
  filter(species == "human") %>%
  count() %>%
  pull()

n_pk_genes <- dim(gse_pk)[1]
stopifnot(dim(gse_pk)[1] == (gene_counts_txi %>% filter(species == "pk") %>% count()))
n_pk_genes_filtered <- dim(gse_pk_filtered)[1]
stopifnot(dim(gse_pk_filtered)[1] == (gene_counts_txi_filtered %>% filter(species == "pk") %>% count()))

Number of unique genes per sample:

gene_counts_txi_filtered %>%
  summarise(across(starts_with("104974"), ~sum(!is.na(.x))))
gene_counts_txi_filtered %>% 
  to_long() %>% 
  group_by(sample) %>% 
  summarize(n = n(),
            n_10 = sum(counts >= 10)
  )

Number of unique Pk genes per sample (after filtering):

gene_counts_txi_filtered %>%
  filter(species=="pk") %>% 
  summarise(across(starts_with("104974"), ~sum(!is.na(.x))))
gene_counts_txi_filtered %>% 
  filter(species=="pk") %>% 
  to_long() %>% 
  group_by(sample) %>% 
  summarize(n = n(),
            n_10 = sum(counts >= 10)
  )

Gene counts

Number of gene counts per sample:

colSums(gene_counts_txi %>% select(starts_with("104974")))
## 104974-001-001_S1_L001 104974-001-002_S1_L001 104974-001-003_S1_L001 
##               22866933               33913208               12906988 
## 104974-001-004_S1_L001 104974-001-005_S1_L001 104974-001-006_S1_L001 
##                7437720               11446209                8903155 
## 104974-001-007_S1_L001 104974-001-008_S1_L001 104974-001-009_S1_L001 
##               34102029               26530404               43610258 
## 104974-001-010_S1_L001 104974-001-011_S1_L001 104974-001-012_S1_L001 
##               45326671               39091317               39997344 
## 104974-001-013_S1_L001 104974-001-014_S1_L001 104974-001-015_S1_L001 
##               61837211               47858788               74893472 
## 104974-001-016_S1_L001 
##               94099625
gene_counts_txi %>% 
  summarise(across(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`, sum)) %>% 
  to_long() %>% 
  left_join(samples, by = "sample") %>%
  ggplot(aes(x = sample_name, y = counts)) +
    geom_bar(stat = "identity") +
    scale_fill_viridis(option = "viridis", discrete = T) +
    scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
    labs(x = "Sample", y = "Total gene counts") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Number of Pk gene counts per sample:

colSums(gene_counts_txi %>% filter(species == "pk") %>% select(starts_with("104974")))
## 104974-001-001_S1_L001 104974-001-002_S1_L001 104974-001-003_S1_L001 
##                 516911                1222797                2797701 
## 104974-001-004_S1_L001 104974-001-005_S1_L001 104974-001-006_S1_L001 
##                2574296                4051264                 232640 
## 104974-001-007_S1_L001 104974-001-008_S1_L001 104974-001-009_S1_L001 
##               14320126               13637513               34289699 
## 104974-001-010_S1_L001 104974-001-011_S1_L001 104974-001-012_S1_L001 
##               37905536               31095828                 340147 
## 104974-001-013_S1_L001 104974-001-014_S1_L001 104974-001-015_S1_L001 
##                 637366                 724010                3099047 
## 104974-001-016_S1_L001 
##                1500227
gene_counts_txi %>% filter(species == "pk") %>% 
  summarise(across(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`, sum)) %>% 
  to_long() %>% 
  left_join(samples, by = "sample") %>%
  ggplot(aes(x = sample_name, y = counts)) +
    geom_bar(stat = "identity") +
    scale_fill_viridis(option = "viridis", discrete = T) +
    scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
    labs(x = "Sample", y = "Pk gene counts") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Comparison of total RNA levels

The direct comparison of RPKM and TPM across samples is meaningful only when there are equal total RNAs between compared samples and the distribution of RNA populations are close to each other.

Make sure both samples use the same RNA isolation approach [poly(A)+ selection versus ribosomal RNA depletion]. If not, they should not be compared.

Check the fraction of the ribosomal, mitochondrial and globin RNAs, and the top highly expressed transcripts and see whether such RNAs constitute a very large part of the sequenced reads in a sample, and thus decrease the sequencing “real estate” available for the remaining genes in that sample. If the calculated fractions in two samples differ significantly, do not compare RPKM or TPM values directly.

TPM should never be used for quantitative comparisons across samples when the total RNA contents and its distributions are very different. However, under appropriate circumstances, TPM can still be useful for qualitative comparison such as PCA and clustering analysis.

(Zhao et al., 2020 - http://dx.doi.org/10.1261/rna.074922.120)

Looking at our samples, especially the ones prepared using the Illumina kit exhibit much higher total RNA content. Based on the output of Picard MarkDuplicates, the differences can at least partially be attributed to the number of PCR duplicates. The proportion of duplicates is comparable across kits, but they do appear to be consistently higher for some depletion methods (no filter > centpip > plasmo > pmacs > cellulose).

gene_counts_txi_filtered %>%
  summarise(across(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`, sum)) %>%
  to_long() %>%
  left_join(samples, by = "sample") %>%
  ggplot(aes(x = sample_short, y = counts, fill = kit_name)) +
  geom_bar(stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  # labels = c("mRNA Illumina", "mRNA Qiagen globin depletion", "mRNA Qiagen globin/rRNA depletion", "tRNA Qiagen globin/rRNA depletion")) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Number of gene-level counts", fill = "Kit")

pcr_dups %>%
  left_join(samples, by = join_by(Sample == sample)) %>%
  mutate(READS_IN_DUPLICATE_PAIRS = 2 * READ_PAIR_DUPLICATES) %>%
  mutate(READS_IN_UNIQUE_PAIRS = 2 * (READ_PAIRS_EXAMINED - READ_PAIR_DUPLICATES)) %>%
  mutate(READS_IN_UNIQUE_UNPAIRED = UNPAIRED_READS_EXAMINED - UNPAIRED_READ_DUPLICATES) %>%
  mutate(READS_IN_DUPLICATE_PAIRS_OPTICAL = 2 * READ_PAIR_OPTICAL_DUPLICATES) %>%
  mutate(READS_IN_DUPLICATE_PAIRS_NONOPTICAL = READS_IN_DUPLICATE_PAIRS - READS_IN_DUPLICATE_PAIRS_OPTICAL) %>%
  mutate(READS_IN_DUPLICATE_UNPAIRED = UNPAIRED_READ_DUPLICATES) %>%
  mutate(READS_UNMAPPED = UNMAPPED_READS) %>%
  select(c(sample_name, condition, kit, READS_IN_UNIQUE_PAIRS, READS_IN_UNIQUE_UNPAIRED, READS_IN_DUPLICATE_PAIRS_OPTICAL, READS_IN_DUPLICATE_PAIRS_NONOPTICAL, READS_IN_DUPLICATE_UNPAIRED)) %>%
  pivot_longer(cols = 4:8, names_to = "type", values_to = "counts") %>%
  ggplot(aes(x = sample_name, y = counts, fill = type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Number of reads", fill = "Picard deduplication stats")

Alignment statistics

The output reported by samtool flagstats and STAR’s Log.final.out files cannot be compared directly: the latter reports counts of read-pairs as opposed to individual reads, and the former does not contain unmapped reads (these are not added to the .bam outputs by default) while also counting multi-mapped reads. We opted to report the STAR statistics, but the flagstat output is included for completion’s sake.

STAR alignment statistics

star_stats
star_stats %>%
  pivot_longer(cols = 2:6, names_to = "type", values_to = "counts") %>%
  left_join(samples, by = join_by(Sample == sample)) %>%
  ggplot(aes(x = sample_name, y = counts, fill = type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Number of read pairs", fill = "Mapping stats")

samtools flagstat

mapping_stats_df
mapping_stats_df %>%
  left_join(samples, by = "sample") %>%
  pivot_longer(cols = flagstat_primary_pk:flagstat_primary_human, names_to = "type", values_to = "counts") %>%
  ggplot(aes(x = sample_name, y = counts, fill = type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Number of primary reads", fill = "Samtools flagstat")

Top expressed reads in TPM

Sample 104974-001-001_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-B2M B2M NA human human 104974-001-001_S1_L001 27679.223
gene-RNR2 RNR2 NA human human_rRNA 104974-001-001_S1_L001 27104.637
gene-S100A8 S100A8 NA human human 104974-001-001_S1_L001 22578.637
gene-S100A9 S100A9 NA human human 104974-001-001_S1_L001 20902.902
gene-ND3 ND3 NA human human 104974-001-001_S1_L001 19021.278
gene-RPS27 RPS27 NA human human 104974-001-001_S1_L001 17762.379
gene-RNR1 RNR1 NA human human_rRNA 104974-001-001_S1_L001 14834.316
gene-COX3 COX3 NA human human 104974-001-001_S1_L001 14662.673
gene-ATP6 ATP6 NA human human 104974-001-001_S1_L001 11625.447
gene-TMSB4X TMSB4X NA human human 104974-001-001_S1_L001 11335.273
gene-FTL FTL NA human human 104974-001-001_S1_L001 10562.604
gene-COX2 COX2 NA human human 104974-001-001_S1_L001 10466.481
gene-RPL41 RPL41 NA human human 104974-001-001_S1_L001 9533.806
gene-ND2 ND2 NA human human 104974-001-001_S1_L001 9363.211
gene-RPLP2 RPLP2 NA human human 104974-001-001_S1_L001 9110.287
gene-RPS20 RPS20 NA human human 104974-001-001_S1_L001 8465.206
gene-RPL21 RPL21 NA human human 104974-001-001_S1_L001 8463.110
gene-RPS12 RPS12 NA human human 104974-001-001_S1_L001 7906.045
gene-COX1 COX1 NA human human 104974-001-001_S1_L001 7773.208
gene-ND1 ND1 NA human human 104974-001-001_S1_L001 7213.298
gene-RPS21 RPS21 NA human human 104974-001-001_S1_L001 7198.383
gene-EEF1A1 EEF1A1 NA human human 104974-001-001_S1_L001 6880.381
gene-RPS24 RPS24 NA human human 104974-001-001_S1_L001 6067.828
gene-RPL30 RPL30 NA human human 104974-001-001_S1_L001 5941.840
gene-B2M-2 B2M NA human human 104974-001-001_S1_L001 5852.971

Sample 104974-001-002_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-B2M B2M NA human human 104974-001-002_S1_L001 34855.539
gene-RNR2 RNR2 NA human human_rRNA 104974-001-002_S1_L001 29430.692
gene-S100A9 S100A9 NA human human 104974-001-002_S1_L001 24583.661
gene-S100A8 S100A8 NA human human 104974-001-002_S1_L001 24327.359
gene-ND3 ND3 NA human human 104974-001-002_S1_L001 16508.778
gene-RNR1 RNR1 NA human human_rRNA 104974-001-002_S1_L001 16214.239
gene-COX3 COX3 NA human human 104974-001-002_S1_L001 15349.098
gene-RPS27 RPS27 NA human human 104974-001-002_S1_L001 13599.609
gene-FTL FTL NA human human 104974-001-002_S1_L001 13013.354
gene-TMSB4X TMSB4X NA human human 104974-001-002_S1_L001 11699.378
gene-ATP6 ATP6 NA human human 104974-001-002_S1_L001 11343.645
gene-COX2 COX2 NA human human 104974-001-002_S1_L001 10378.688
gene-ND2 ND2 NA human human 104974-001-002_S1_L001 9614.317
gene-RPL21 RPL21 NA human human 104974-001-002_S1_L001 8803.669
gene-RPL41 RPL41 NA human human 104974-001-002_S1_L001 8432.933
gene-COX1 COX1 NA human human 104974-001-002_S1_L001 8236.118
gene-RPS12 RPS12 NA human human 104974-001-002_S1_L001 8169.655
gene-RPLP2 RPLP2 NA human human 104974-001-002_S1_L001 7483.747
gene-ND1 ND1 NA human human 104974-001-002_S1_L001 7330.897
gene-EEF1A1 EEF1A1 NA human human 104974-001-002_S1_L001 7075.555
gene-B2M-2 B2M NA human human 104974-001-002_S1_L001 7055.202
gene-HBA2 HBA2 NA human human_globin 104974-001-002_S1_L001 6973.821
gene-RPS20 RPS20 NA human human 104974-001-002_S1_L001 6841.469
gene-SRGN SRGN NA human human 104974-001-002_S1_L001 6536.149
gene-ND4 ND4 NA human human 104974-001-002_S1_L001 5657.471

Sample 104974-001-003_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBA2 HBA2 NA human human_globin 104974-001-003_S1_L001 46533.376
gene-FTL FTL NA human human 104974-001-003_S1_L001 29467.269
gene-HBB HBB NA human human_globin 104974-001-003_S1_L001 27466.991
gene-RPL21 RPL21 NA human human 104974-001-003_S1_L001 23643.612
gene-UBA52 UBA52 NA human human 104974-001-003_S1_L001 21219.997
gene-RPS12 RPS12 NA human human 104974-001-003_S1_L001 20864.756
gene-RNR2 RNR2 NA human human_rRNA 104974-001-003_S1_L001 19293.910
gene-UBB UBB NA human human 104974-001-003_S1_L001 17736.970
gene-HBA1 HBA1 NA human human_globin 104974-001-003_S1_L001 16262.189
gene-B2M B2M NA human human 104974-001-003_S1_L001 14549.406
gene-GYPC GYPC NA human human 104974-001-003_S1_L001 12990.299
gene-RPL41 RPL41 NA human human 104974-001-003_S1_L001 11766.569
gene-OAZ1 OAZ1 NA human human 104974-001-003_S1_L001 9783.310
gene-FKBP8 FKBP8 NA human human 104974-001-003_S1_L001 8678.414
gene-S100A8 S100A8 NA human human 104974-001-003_S1_L001 7978.580
gene-S100A9 S100A9 NA human human 104974-001-003_S1_L001 7889.953
gene-SLC25A39 SLC25A39 NA human human 104974-001-003_S1_L001 7762.571
gene-RPS27 RPS27 NA human human 104974-001-003_S1_L001 7712.087
gene-RPLP2 RPLP2 NA human human 104974-001-003_S1_L001 7343.200
gene-RNR1 RNR1 NA human human_rRNA 104974-001-003_S1_L001 6908.576
gene-RPL30 RPL30 NA human human 104974-001-003_S1_L001 6190.096
gene-COX3 COX3 NA human human 104974-001-003_S1_L001 6067.999
gene-RPS11 RPS11 NA human human 104974-001-003_S1_L001 5505.873
gene-ND3 ND3 NA human human 104974-001-003_S1_L001 5258.114
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-003_S1_L001 5022.868

Sample 104974-001-004_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBA2 HBA2 NA human human_globin 104974-001-004_S1_L001 52755.665
gene-HBB HBB NA human human_globin 104974-001-004_S1_L001 22097.339
gene-HBA1 HBA1 NA human human_globin 104974-001-004_S1_L001 21210.681
gene-FTL FTL NA human human 104974-001-004_S1_L001 19832.883
gene-UBA52 UBA52 NA human human 104974-001-004_S1_L001 13148.031
gene-RPL21 RPL21 NA human human 104974-001-004_S1_L001 12112.246
gene-RPS12 RPS12 NA human human 104974-001-004_S1_L001 11859.816
gene-B2M B2M NA human human 104974-001-004_S1_L001 10750.176
gene-UBB UBB NA human human 104974-001-004_S1_L001 9926.561
gene-RNR2 RNR2 NA human human_rRNA 104974-001-004_S1_L001 9832.230
gene-GYPC GYPC NA human human 104974-001-004_S1_L001 8986.872
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-004_S1_L001 8314.573
gene-S100A8 S100A8 NA human human 104974-001-004_S1_L001 7066.256
gene-S100A9 S100A9 NA human human 104974-001-004_S1_L001 7006.419
gene-FKBP8 FKBP8 NA human human 104974-001-004_S1_L001 6986.322
gene-COX3 COX3 NA human human 104974-001-004_S1_L001 6966.357
gene-OAZ1 OAZ1 NA human human 104974-001-004_S1_L001 6785.591
gene-RPL41 RPL41 NA human human 104974-001-004_S1_L001 6779.247
gene-COX1 COX1 NA human human 104974-001-004_S1_L001 6315.816
gene-RPS27 RPS27 NA human human 104974-001-004_S1_L001 6261.843
gene-SLC25A39 SLC25A39 NA human human 104974-001-004_S1_L001 6113.571
gene-ATP6 ATP6 NA human human 104974-001-004_S1_L001 6088.562
gene-ND3 ND3 NA human human 104974-001-004_S1_L001 5573.493
gene-COX2 COX2 NA human human 104974-001-004_S1_L001 5157.936
gene-RPLP2 RPLP2 NA human human 104974-001-004_S1_L001 5087.297

Sample 104974-001-005_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBA2 HBA2 NA human human_globin 104974-001-005_S1_L001 49591.793
gene-FTL FTL NA human human 104974-001-005_S1_L001 44351.515
gene-UBA52 UBA52 NA human human 104974-001-005_S1_L001 36767.580
gene-RPL21 RPL21 NA human human 104974-001-005_S1_L001 35030.120
gene-RPS12 RPS12 NA human human 104974-001-005_S1_L001 30090.973
gene-HBB HBB NA human human_globin 104974-001-005_S1_L001 29687.257
gene-UBB UBB NA human human 104974-001-005_S1_L001 26975.937
gene-RNR2 RNR2 NA human human_rRNA 104974-001-005_S1_L001 26033.024
gene-GYPC GYPC NA human human 104974-001-005_S1_L001 20473.960
gene-RPL41 RPL41 NA human human 104974-001-005_S1_L001 17038.159
gene-HBA1 HBA1 NA human human_globin 104974-001-005_S1_L001 16668.188
gene-OAZ1 OAZ1 NA human human 104974-001-005_S1_L001 14476.128
gene-FKBP8 FKBP8 NA human human 104974-001-005_S1_L001 13027.216
gene-SLC25A39 SLC25A39 NA human human 104974-001-005_S1_L001 12949.204
gene-RNR1 RNR1 NA human human_rRNA 104974-001-005_S1_L001 8443.443
gene-RPLP2 RPLP2 NA human human 104974-001-005_S1_L001 7333.275
gene-RPS11 RPS11 NA human human 104974-001-005_S1_L001 7058.348
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-005_S1_L001 6826.241
gene-RPL30 RPL30 NA human human 104974-001-005_S1_L001 6393.033
gene-GUK1 GUK1 NA human human 104974-001-005_S1_L001 6026.648
gene-RPL12 RPL12 NA human human 104974-001-005_S1_L001 5719.974
gene-ADIPOR1 ADIPOR1 NA human human 104974-001-005_S1_L001 5600.955
gene-PKNH_1355500 gene-PKNH_1355500 protein_coding pk pk 104974-001-005_S1_L001 5244.759
gene-RPS15 RPS15 NA human human 104974-001-005_S1_L001 4708.231
gene-RPS24 RPS24 NA human human 104974-001-005_S1_L001 4399.440

Sample 104974-001-006_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-B2M B2M NA human human 104974-001-006_S1_L001 28553.397
gene-COX3 COX3 NA human human 104974-001-006_S1_L001 19236.293
gene-S100A9 S100A9 NA human human 104974-001-006_S1_L001 18303.677
gene-TMSB4X TMSB4X NA human human 104974-001-006_S1_L001 18124.259
gene-S100A8 S100A8 NA human human 104974-001-006_S1_L001 16150.164
gene-B2M-2 B2M NA human human 104974-001-006_S1_L001 15785.354
gene-ND3 ND3 NA human human 104974-001-006_S1_L001 15074.849
gene-COX1 COX1 NA human human 104974-001-006_S1_L001 14709.890
gene-COX2 COX2 NA human human 104974-001-006_S1_L001 14454.597
gene-ND2 ND2 NA human human 104974-001-006_S1_L001 14204.682
gene-FTL FTL NA human human 104974-001-006_S1_L001 11557.118
gene-ND1 ND1 NA human human 104974-001-006_S1_L001 10850.009
gene-ATP6 ATP6 NA human human 104974-001-006_S1_L001 10710.461
gene-RPS27 RPS27 NA human human 104974-001-006_S1_L001 9942.583
gene-ND4 ND4 NA human human 104974-001-006_S1_L001 9064.038
gene-ATP8 ATP8 NA human human 104974-001-006_S1_L001 8615.542
gene-EEF1A1 EEF1A1 NA human human 104974-001-006_S1_L001 8459.874
gene-RPS20 RPS20 NA human human 104974-001-006_S1_L001 7580.700
gene-CYTB CYTB NA human human 104974-001-006_S1_L001 7165.345
gene-RPL41 RPL41 NA human human 104974-001-006_S1_L001 7005.028
gene-RPL19 RPL19 NA human human 104974-001-006_S1_L001 6904.254
gene-RPL21 RPL21 NA human human 104974-001-006_S1_L001 6793.811
gene-RPL34 RPL34 NA human human 104974-001-006_S1_L001 5924.000
gene-RPL9 RPL9 NA human human 104974-001-006_S1_L001 5814.001
gene-FTH1 FTH1 NA human human 104974-001-006_S1_L001 5571.215

Sample 104974-001-007_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-RN7SL2 RN7SL2 NA human human 104974-001-007_S1_L001 131068.139
gene-RN7SL1 RN7SL1 NA human human 104974-001-007_S1_L001 92664.227
gene-PKNH_0321100 gene-PKNH_0321100 rRNA pk pk_rRNA 104974-001-007_S1_L001 73099.379
gene-RN7SK RN7SK NA human human 104974-001-007_S1_L001 70738.871
gene-PKNH_1001000 gene-PKNH_1001000 rRNA pk pk_rRNA 104974-001-007_S1_L001 58787.433
gene-PKNH_0320900 gene-PKNH_0320900 rRNA pk pk_rRNA 104974-001-007_S1_L001 27365.645
gene-PKNH_1001200 gene-PKNH_1001200 rRNA pk pk_rRNA 104974-001-007_S1_L001 25716.594
gene-PKNH_0321000 gene-PKNH_0321000 rRNA pk pk_rRNA 104974-001-007_S1_L001 17270.634
gene-RMRP RMRP NA human human 104974-001-007_S1_L001 14366.099
gene-B2M B2M NA human human 104974-001-007_S1_L001 12165.333
gene-PKNH_1339200 gene-PKNH_1339200 ncRNA pk pk 104974-001-007_S1_L001 8712.438
gene-TMSB4X TMSB4X NA human human 104974-001-007_S1_L001 7823.012
gene-COX3 COX3 NA human human 104974-001-007_S1_L001 7685.253
gene-PKNH_1001100 gene-PKNH_1001100 rRNA pk pk_rRNA 104974-001-007_S1_L001 7579.708
gene-ND2 ND2 NA human human 104974-001-007_S1_L001 6852.100
gene-COX1 COX1 NA human human 104974-001-007_S1_L001 6031.814
gene-COX2 COX2 NA human human 104974-001-007_S1_L001 5463.136
gene-ATP6 ATP6 NA human human 104974-001-007_S1_L001 5451.582
gene-S100A8 S100A8 NA human human 104974-001-007_S1_L001 5309.715
gene-ND1 ND1 NA human human 104974-001-007_S1_L001 5106.975
gene-ND4 ND4 NA human human 104974-001-007_S1_L001 4979.826
gene-PKNH_0935200 gene-PKNH_0935200 snRNA pk pk 104974-001-007_S1_L001 4468.936
gene-ND3 ND3 NA human human 104974-001-007_S1_L001 4455.028
gene-S100A9 S100A9 NA human human 104974-001-007_S1_L001 4377.385
gene-RPPH1 RPPH1 NA human human 104974-001-007_S1_L001 4178.458

Sample 104974-001-008_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-RN7SL2 RN7SL2 NA human human 104974-001-008_S1_L001 139279.456
gene-RN7SL1 RN7SL1 NA human human 104974-001-008_S1_L001 88237.728
gene-PKNH_1001000 gene-PKNH_1001000 rRNA pk pk_rRNA 104974-001-008_S1_L001 78313.591
gene-PKNH_0321100 gene-PKNH_0321100 rRNA pk pk_rRNA 104974-001-008_S1_L001 76693.896
gene-RN7SK RN7SK NA human human 104974-001-008_S1_L001 76096.199
gene-PKNH_0320900 gene-PKNH_0320900 rRNA pk pk_rRNA 104974-001-008_S1_L001 30296.682
gene-PKNH_1001200 gene-PKNH_1001200 rRNA pk pk_rRNA 104974-001-008_S1_L001 27906.598
gene-PKNH_0321000 gene-PKNH_0321000 rRNA pk pk_rRNA 104974-001-008_S1_L001 23171.875
gene-RMRP RMRP NA human human 104974-001-008_S1_L001 16032.224
gene-PKNH_1001100 gene-PKNH_1001100 rRNA pk pk_rRNA 104974-001-008_S1_L001 14289.531
gene-B2M B2M NA human human 104974-001-008_S1_L001 12222.046
gene-PKNH_1339200 gene-PKNH_1339200 ncRNA pk pk 104974-001-008_S1_L001 9740.299
gene-PKNH_0935200 gene-PKNH_0935200 snRNA pk pk 104974-001-008_S1_L001 7697.558
gene-COX3 COX3 NA human human 104974-001-008_S1_L001 7411.967
gene-TMSB4X TMSB4X NA human human 104974-001-008_S1_L001 6867.503
gene-ND2 ND2 NA human human 104974-001-008_S1_L001 5897.639
gene-COX1 COX1 NA human human 104974-001-008_S1_L001 5487.208
gene-S100A8 S100A8 NA human human 104974-001-008_S1_L001 5483.937
gene-COX2 COX2 NA human human 104974-001-008_S1_L001 5235.476
gene-RPPH1 RPPH1 NA human human 104974-001-008_S1_L001 5041.748
gene-RPPH1-2 RPPH1 NA human human 104974-001-008_S1_L001 4996.489
gene-ATP6 ATP6 NA human human 104974-001-008_S1_L001 4855.997
gene-S100A9 S100A9 NA human human 104974-001-008_S1_L001 4804.481
gene-ND1 ND1 NA human human 104974-001-008_S1_L001 4769.538
gene-ND4 ND4 NA human human 104974-001-008_S1_L001 4491.039

Sample 104974-001-009_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-RN7SL2 RN7SL2 NA human human 104974-001-009_S1_L001 145681.116
gene-PKNH_0321100 gene-PKNH_0321100 rRNA pk pk_rRNA 104974-001-009_S1_L001 144323.085
gene-PKNH_1001000 gene-PKNH_1001000 rRNA pk pk_rRNA 104974-001-009_S1_L001 113693.416
gene-RN7SL1 RN7SL1 NA human human 104974-001-009_S1_L001 97956.989
gene-PKNH_0320900 gene-PKNH_0320900 rRNA pk pk_rRNA 104974-001-009_S1_L001 58482.134
gene-PKNH_0321000 gene-PKNH_0321000 rRNA pk pk_rRNA 104974-001-009_S1_L001 54071.123
gene-PKNH_1001200 gene-PKNH_1001200 rRNA pk pk_rRNA 104974-001-009_S1_L001 53881.108
gene-PKNH_1001100 gene-PKNH_1001100 rRNA pk pk_rRNA 104974-001-009_S1_L001 24293.746
gene-RN7SK RN7SK NA human human 104974-001-009_S1_L001 20907.443
gene-PKNH_0935200 gene-PKNH_0935200 snRNA pk pk 104974-001-009_S1_L001 18472.293
gene-PKNH_1339200 gene-PKNH_1339200 ncRNA pk pk 104974-001-009_S1_L001 12768.772
gene-HBB HBB NA human human_globin 104974-001-009_S1_L001 6401.856
gene-FTL FTL NA human human 104974-001-009_S1_L001 5156.650
gene-UBB UBB NA human human 104974-001-009_S1_L001 3804.390
gene-ADIPOR1 ADIPOR1 NA human human 104974-001-009_S1_L001 3777.510
gene-RPPH1-2 RPPH1 NA human human 104974-001-009_S1_L001 3718.740
gene-RPPH1 RPPH1 NA human human 104974-001-009_S1_L001 3692.364
gene-RNY1 RNY1 NA human human 104974-001-009_S1_L001 3687.490
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-009_S1_L001 3199.299
gene-GYPC GYPC NA human human 104974-001-009_S1_L001 3180.375
gene-RPL21 RPL21 NA human human 104974-001-009_S1_L001 2826.477
gene-HBA2 HBA2 NA human human_globin 104974-001-009_S1_L001 2678.665
gene-RMRP RMRP NA human human 104974-001-009_S1_L001 2494.538
gene-UBA52 UBA52 NA human human 104974-001-009_S1_L001 2312.951
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-009_S1_L001 2094.309

Sample 104974-001-010_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-PKNH_0321100 gene-PKNH_0321100 rRNA pk pk_rRNA 104974-001-010_S1_L001 181501.107
gene-PKNH_1001000 gene-PKNH_1001000 rRNA pk pk_rRNA 104974-001-010_S1_L001 143352.902
gene-RN7SL2 RN7SL2 NA human human 104974-001-010_S1_L001 86488.492
gene-PKNH_0320900 gene-PKNH_0320900 rRNA pk pk_rRNA 104974-001-010_S1_L001 70146.903
gene-PKNH_1001200 gene-PKNH_1001200 rRNA pk pk_rRNA 104974-001-010_S1_L001 66180.869
gene-RN7SL1 RN7SL1 NA human human 104974-001-010_S1_L001 60353.922
gene-PKNH_0321000 gene-PKNH_0321000 rRNA pk pk_rRNA 104974-001-010_S1_L001 54735.717
gene-PKNH_1001100 gene-PKNH_1001100 rRNA pk pk_rRNA 104974-001-010_S1_L001 20313.046
gene-PKNH_0935200 gene-PKNH_0935200 snRNA pk pk 104974-001-010_S1_L001 16268.334
gene-RN7SK RN7SK NA human human 104974-001-010_S1_L001 13867.096
gene-PKNH_1339200 gene-PKNH_1339200 ncRNA pk pk 104974-001-010_S1_L001 9108.890
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-010_S1_L001 3993.278
gene-HBA2 HBA2 NA human human_globin 104974-001-010_S1_L001 3889.037
gene-HBB HBB NA human human_globin 104974-001-010_S1_L001 3806.526
gene-ADIPOR1 ADIPOR1 NA human human 104974-001-010_S1_L001 3607.743
gene-FTL FTL NA human human 104974-001-010_S1_L001 3459.588
gene-HBA1 HBA1 NA human human_globin 104974-001-010_S1_L001 3183.081
gene-GYPC GYPC NA human human 104974-001-010_S1_L001 2799.467
gene-RPPH1 RPPH1 NA human human 104974-001-010_S1_L001 2659.526
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-010_S1_L001 2645.063
gene-RPPH1-2 RPPH1 NA human human 104974-001-010_S1_L001 2627.532
gene-PKNH_1132600 gene-PKNH_1132600 protein_coding pk pk 104974-001-010_S1_L001 2592.234
gene-COX1 COX1 NA human human 104974-001-010_S1_L001 2414.909
gene-UBB UBB NA human human 104974-001-010_S1_L001 2265.004
gene-RMRP RMRP NA human human 104974-001-010_S1_L001 2211.677

Sample 104974-001-011_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-RN7SL2 RN7SL2 NA human human 104974-001-011_S1_L001 161042.174
gene-PKNH_0321100 gene-PKNH_0321100 rRNA pk pk_rRNA 104974-001-011_S1_L001 147935.000
gene-PKNH_1001000 gene-PKNH_1001000 rRNA pk pk_rRNA 104974-001-011_S1_L001 115582.956
gene-RN7SL1 RN7SL1 NA human human 104974-001-011_S1_L001 108220.532
gene-PKNH_0320900 gene-PKNH_0320900 rRNA pk pk_rRNA 104974-001-011_S1_L001 61256.956
gene-PKNH_1001200 gene-PKNH_1001200 rRNA pk pk_rRNA 104974-001-011_S1_L001 55243.977
gene-PKNH_0321000 gene-PKNH_0321000 rRNA pk pk_rRNA 104974-001-011_S1_L001 42015.846
gene-PKNH_1001100 gene-PKNH_1001100 rRNA pk pk_rRNA 104974-001-011_S1_L001 18712.764
gene-RN7SK RN7SK NA human human 104974-001-011_S1_L001 10855.497
gene-PKNH_0935200 gene-PKNH_0935200 snRNA pk pk 104974-001-011_S1_L001 10217.904
gene-PKNH_1339200 gene-PKNH_1339200 ncRNA pk pk 104974-001-011_S1_L001 7907.086
gene-FTL FTL NA human human 104974-001-011_S1_L001 6860.040
gene-HBB HBB NA human human_globin 104974-001-011_S1_L001 5702.090
gene-ADIPOR1 ADIPOR1 NA human human 104974-001-011_S1_L001 5452.125
gene-UBB UBB NA human human 104974-001-011_S1_L001 5000.284
gene-GYPC GYPC NA human human 104974-001-011_S1_L001 4800.788
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-011_S1_L001 4511.897
gene-RPL21 RPL21 NA human human 104974-001-011_S1_L001 4355.019
gene-RPPH1-2 RPPH1 NA human human 104974-001-011_S1_L001 3979.430
gene-RPPH1 RPPH1 NA human human 104974-001-011_S1_L001 3978.027
gene-SLC25A39 SLC25A39 NA human human 104974-001-011_S1_L001 3085.912
gene-RNY1 RNY1 NA human human 104974-001-011_S1_L001 2933.827
gene-UBA52 UBA52 NA human human 104974-001-011_S1_L001 2915.853
gene-OAZ1 OAZ1 NA human human 104974-001-011_S1_L001 2894.726
gene-HBA2 HBA2 NA human human_globin 104974-001-011_S1_L001 2852.671

Sample 104974-001-012_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBB HBB NA human human_globin 104974-001-012_S1_L001 366480.180
gene-HBA2 HBA2 NA human human_globin 104974-001-012_S1_L001 263245.083
gene-HBA1 HBA1 NA human human_globin 104974-001-012_S1_L001 98009.922
gene-B2M B2M NA human human 104974-001-012_S1_L001 6733.582
gene-RNR2 RNR2 NA human human_rRNA 104974-001-012_S1_L001 5784.927
gene-S100A9 S100A9 NA human human 104974-001-012_S1_L001 5749.014
gene-S100A8 S100A8 NA human human 104974-001-012_S1_L001 4593.144
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-012_S1_L001 4190.916
gene-LOC124905315 LOC124905315 NA human human 104974-001-012_S1_L001 3825.278
gene-RNR1 RNR1 NA human human_rRNA 104974-001-012_S1_L001 3549.196
gene-TMSB4X TMSB4X NA human human 104974-001-012_S1_L001 3347.079
gene-RPS27 RPS27 NA human human 104974-001-012_S1_L001 3045.938
gene-FTL FTL NA human human 104974-001-012_S1_L001 3016.675
gene-ND3 ND3 NA human human 104974-001-012_S1_L001 2750.724
gene-COX3 COX3 NA human human 104974-001-012_S1_L001 2577.031
gene-RPL41 RPL41 NA human human 104974-001-012_S1_L001 2375.402
gene-RPS12 RPS12 NA human human 104974-001-012_S1_L001 2212.215
gene-COX2 COX2 NA human human 104974-001-012_S1_L001 2207.033
gene-RPL39 RPL39 NA human human 104974-001-012_S1_L001 2132.982
gene-ATP8 ATP8 NA human human 104974-001-012_S1_L001 1919.579
gene-EEF1A1 EEF1A1 NA human human 104974-001-012_S1_L001 1903.976
gene-RNA28SN2 RNA28SN2 NA human human_rRNA 104974-001-012_S1_L001 1875.808
gene-ATP6 ATP6 NA human human 104974-001-012_S1_L001 1867.009
gene-RNA18SN2 RNA18SN2 NA human human_rRNA 104974-001-012_S1_L001 1677.114
gene-RNA18SN3 RNA18SN3 NA human human_rRNA 104974-001-012_S1_L001 1671.307

Sample 104974-001-013_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBB HBB NA human human_globin 104974-001-013_S1_L001 388222.402
gene-HBA2 HBA2 NA human human_globin 104974-001-013_S1_L001 289856.854
gene-HBA1 HBA1 NA human human_globin 104974-001-013_S1_L001 103444.535
gene-S100A9 S100A9 NA human human 104974-001-013_S1_L001 6019.643
gene-B2M B2M NA human human 104974-001-013_S1_L001 5886.942
gene-RNR2 RNR2 NA human human_rRNA 104974-001-013_S1_L001 5240.822
gene-S100A8 S100A8 NA human human 104974-001-013_S1_L001 4629.461
gene-RNR1 RNR1 NA human human_rRNA 104974-001-013_S1_L001 3265.530
gene-FTL FTL NA human human 104974-001-013_S1_L001 3136.281
gene-TMSB4X TMSB4X NA human human 104974-001-013_S1_L001 2718.495
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-013_S1_L001 2457.827
gene-COX3 COX3 NA human human 104974-001-013_S1_L001 2138.235
gene-ND3 ND3 NA human human 104974-001-013_S1_L001 2125.318
gene-RPS27 RPS27 NA human human 104974-001-013_S1_L001 2021.781
gene-RPS12 RPS12 NA human human 104974-001-013_S1_L001 1910.314
gene-RPL41 RPL41 NA human human 104974-001-013_S1_L001 1851.973
gene-LOC124905315 LOC124905315 NA human human 104974-001-013_S1_L001 1741.411
gene-COX2 COX2 NA human human 104974-001-013_S1_L001 1678.269
gene-FTH1 FTH1 NA human human 104974-001-013_S1_L001 1635.636
gene-RPL39 RPL39 NA human human 104974-001-013_S1_L001 1558.862
gene-ATP8 ATP8 NA human human 104974-001-013_S1_L001 1532.902
gene-EEF1A1 EEF1A1 NA human human 104974-001-013_S1_L001 1446.312
gene-ATP6 ATP6 NA human human 104974-001-013_S1_L001 1419.527
gene-RPL21 RPL21 NA human human 104974-001-013_S1_L001 1331.099
gene-COX1 COX1 NA human human 104974-001-013_S1_L001 1240.701

Sample 104974-001-014_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBB HBB NA human human_globin 104974-001-014_S1_L001 523264.4084
gene-HBA2 HBA2 NA human human_globin 104974-001-014_S1_L001 284480.8538
gene-HBA1 HBA1 NA human human_globin 104974-001-014_S1_L001 114371.1571
gene-LOC124905315 LOC124905315 NA human human 104974-001-014_S1_L001 2918.1146
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-014_S1_L001 2536.0602
gene-FTL FTL NA human human 104974-001-014_S1_L001 1871.6873
gene-RNA18SN3-2 RNA18SN3 NA human human_rRNA 104974-001-014_S1_L001 1543.4707
gene-RNA18SN4 RNA18SN4 NA human human_rRNA 104974-001-014_S1_L001 1542.5888
gene-RNA18SN3 RNA18SN3 NA human human_rRNA 104974-001-014_S1_L001 1542.4997
gene-RNA18SN2 RNA18SN2 NA human human_rRNA 104974-001-014_S1_L001 1535.9067
gene-RPS12 RPS12 NA human human 104974-001-014_S1_L001 1366.6827
gene-RNA28SN2 RNA28SN2 NA human human_rRNA 104974-001-014_S1_L001 1296.8516
gene-RNR2 RNR2 NA human human_rRNA 104974-001-014_S1_L001 1166.9362
gene-UBA52 UBA52 NA human human 104974-001-014_S1_L001 1068.1666
gene-RPL21 RPL21 NA human human 104974-001-014_S1_L001 1028.5515
gene-UBB UBB NA human human 104974-001-014_S1_L001 892.5544
gene-GYPC GYPC NA human human 104974-001-014_S1_L001 851.2235
gene-HBG2 HBG2 NA human human_globin 104974-001-014_S1_L001 842.1872
gene-FKBP8 FKBP8 NA human human 104974-001-014_S1_L001 814.0800
gene-B2M B2M NA human human 104974-001-014_S1_L001 783.0767
gene-RPL41 RPL41 NA human human 104974-001-014_S1_L001 715.1521
gene-RNA5-8SN4 RNA5-8SN4 NA human human_rRNA 104974-001-014_S1_L001 636.0227
gene-OAZ1 OAZ1 NA human human 104974-001-014_S1_L001 599.3413
gene-S100A9 S100A9 NA human human 104974-001-014_S1_L001 578.5453
gene-SLC25A39 SLC25A39 NA human human 104974-001-014_S1_L001 555.1666

Sample 104974-001-015_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBB HBB NA human human_globin 104974-001-015_S1_L001 448423.0722
gene-HBA2 HBA2 NA human human_globin 104974-001-015_S1_L001 316409.7173
gene-HBA1 HBA1 NA human human_globin 104974-001-015_S1_L001 127791.9575
gene-LOC124905315 LOC124905315 NA human human 104974-001-015_S1_L001 3596.4548
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-015_S1_L001 3411.2697
gene-RNA18SN2 RNA18SN2 NA human human_rRNA 104974-001-015_S1_L001 1762.2669
gene-RNA18SN4 RNA18SN4 NA human human_rRNA 104974-001-015_S1_L001 1756.8958
gene-RNA18SN3 RNA18SN3 NA human human_rRNA 104974-001-015_S1_L001 1753.9691
gene-RNA18SN3-2 RNA18SN3 NA human human_rRNA 104974-001-015_S1_L001 1750.1382
gene-FTL FTL NA human human 104974-001-015_S1_L001 1660.3870
gene-RNA28SN2 RNA28SN2 NA human human_rRNA 104974-001-015_S1_L001 1615.4719
gene-RPS12 RPS12 NA human human 104974-001-015_S1_L001 1099.8989
gene-UBA52 UBA52 NA human human 104974-001-015_S1_L001 1020.6326
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-015_S1_L001 925.7916
gene-FKBP8 FKBP8 NA human human 104974-001-015_S1_L001 917.5296
gene-GYPC GYPC NA human human 104974-001-015_S1_L001 858.6056
gene-B2M B2M NA human human 104974-001-015_S1_L001 792.5917
gene-RPL21 RPL21 NA human human 104974-001-015_S1_L001 779.4010
gene-RNR2 RNR2 NA human human_rRNA 104974-001-015_S1_L001 768.8011
gene-UBB UBB NA human human 104974-001-015_S1_L001 734.8697
gene-S100A9 S100A9 NA human human 104974-001-015_S1_L001 692.6973
gene-HBG2 HBG2 NA human human_globin 104974-001-015_S1_L001 663.0136
gene-OAZ1 OAZ1 NA human human 104974-001-015_S1_L001 576.5540
gene-SLC25A39 SLC25A39 NA human human 104974-001-015_S1_L001 567.3799
gene-RPL41 RPL41 NA human human 104974-001-015_S1_L001 566.2701

Sample 104974-001-016_S1_L001

gene_id gene_name gene_biotype species gene_type sample counts
gene-HBB HBB NA human human_globin 104974-001-016_S1_L001 514171.8546
gene-HBA2 HBA2 NA human human_globin 104974-001-016_S1_L001 301490.6876
gene-HBA1 HBA1 NA human human_globin 104974-001-016_S1_L001 122126.6477
gene-LOC124905315 LOC124905315 NA human human 104974-001-016_S1_L001 3512.6965
gene-RNA28SN4 RNA28SN4 NA human human_rRNA 104974-001-016_S1_L001 3033.8382
gene-RNA18SN2 RNA18SN2 NA human human_rRNA 104974-001-016_S1_L001 2096.6635
gene-RNA18SN3 RNA18SN3 NA human human_rRNA 104974-001-016_S1_L001 2086.6004
gene-RNA18SN4 RNA18SN4 NA human human_rRNA 104974-001-016_S1_L001 2073.9575
gene-RNA18SN3-2 RNA18SN3 NA human human_rRNA 104974-001-016_S1_L001 2066.1062
gene-FTL FTL NA human human 104974-001-016_S1_L001 1780.1712
gene-RNA28SN2 RNA28SN2 NA human human_rRNA 104974-001-016_S1_L001 1534.3552
gene-RPS12 RPS12 NA human human 104974-001-016_S1_L001 1288.8221
gene-UBA52 UBA52 NA human human 104974-001-016_S1_L001 1087.6524
gene-RPL21 RPL21 NA human human 104974-001-016_S1_L001 990.7414
gene-RNR2 RNR2 NA human human_rRNA 104974-001-016_S1_L001 990.5111
gene-UBB UBB NA human human 104974-001-016_S1_L001 922.4129
gene-HBG2 HBG2 NA human human_globin 104974-001-016_S1_L001 911.4972
gene-GYPC GYPC NA human human 104974-001-016_S1_L001 883.1664
gene-FKBP8 FKBP8 NA human human 104974-001-016_S1_L001 818.6246
gene-OAZ1 OAZ1 NA human human 104974-001-016_S1_L001 600.4898
gene-RPL41 RPL41 NA human human 104974-001-016_S1_L001 559.7779
gene-SLC25A39 SLC25A39 NA human human 104974-001-016_S1_L001 555.5004
gene-RNA5-8SN4 RNA5-8SN4 NA human human_rRNA 104974-001-016_S1_L001 525.1435
gene-RN7SL2 RN7SL2 NA human human 104974-001-016_S1_L001 410.6589
gene-PKNH_0418600 gene-PKNH_0418600 protein_coding pk pk 104974-001-016_S1_L001 380.8039

FastQ Screen statistics

Also see figure in results/fastq-screen/ and results/multiqc-fastq-screen-samtools-pk/multiqc_report.html.

All reads for both species

fastq_screen_transformed <- bind_rows(
  fastq_screen %>%
    mutate(`%_single_genome` = `%_One_hit_one_genome` + `%_Multiple_hits_one_genome`) %>%
    mutate(`%_multiple_genomes` = `%_One_hit_multiple_genomes` + `%_Multiple_hits_multiple_genomes`) %>%
    mutate(`No_hits` = `%_One_hit_multiple_genomes` + `%_Multiple_hits_multiple_genomes`) %>%
    select(c(sample, Genome, `#Reads_processed`, `%_single_genome`, `%_multiple_genomes`)) %>%
    pivot_wider(names_from = Genome, values_from = c(`%_single_genome`, `%_multiple_genomes`)) %>%
    select(!`%_multiple_genomes_PknowlesiH`) %>%
    rename(`%_multiple_genomes` = `%_multiple_genomes_Human`) %>%
    mutate(pct = 1 - `%_single_genome_PknowlesiH` - `%_single_genome_Human` - `%_multiple_genomes`) %>%
    select(sample, pct) %>%
    mutate("Mapped reads" = "No hits"),
  fastq_screen %>%
    mutate(pct = `%_One_hit_one_genome` + `%_Multiple_hits_one_genome`) %>%
    select(c(sample, Genome, pct)) %>%
    rename("Mapped reads" = Genome),
  fastq_screen %>%
    mutate(pct = `%_One_hit_multiple_genomes` + `%_Multiple_hits_multiple_genomes`) %>%
    select(c(sample, pct)) %>%
    distinct(sample, .keep_all = TRUE) %>%
    mutate("Mapped reads" = "Multiple genomes")
) %>%
  # change order of levels here to adjust colours
  mutate(`Mapped reads` = factor(`Mapped reads`,
    levels = c("PknowlesiH", "Human", "Multiple genomes", "No hits"),
    labels = c("P. knowlesi", "Human", "Multiple hits", "No hits")
  )) %>%
  left_join(samples, by = "sample")

ggplot(fastq_screen_transformed, aes(x = sample_short, y = pct, fill = `Mapped reads`)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T, limits = rev) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(fill = expression("FastQ Screen mapped reads"), x = "Sample") +
  ylab(label = "% of mapped reads")

Human reads only

# Human reads
fastq_screen %>%
  pivot_longer(
    cols = starts_with("%"),
    names_to = "fastq_screen_pct_type",
    values_to = "fastq_screen_pct_value"
  ) %>%
  filter(Genome == "Human") %>%
  left_join(samples) %>%
  ggplot(aes(x = sample_short, y = fastq_screen_pct_value, fill = fastq_screen_pct_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(fill = "FastQ screen statistics for human reads", x = "Sample", y = "% of mapped reads")
## Joining with `by = join_by(sample)`

Pk reads

# Pk reads
fastq_screen %>%
  pivot_longer(
    cols = starts_with("%"),
    names_to = "fastq_screen_pct_type",
    values_to = "fastq_screen_pct_value"
  ) %>%
  filter(Genome == "PknowlesiH") %>%
  left_join(samples) %>%
  ggplot(aes(x = sample_short, y = fastq_screen_pct_value, fill = fastq_screen_pct_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(fill = expression("FastQ screen statistics for" ~ italic("P. knowlesi") ~ "reads"), x = "Sample", y = "% of mapped reads")
## Joining with `by = join_by(sample)`

rRNA and globin levels

Barplot of TPM for rRNA genes per condition

gene_annotation %>%
  # select rRNA and globins
  filter(!gene_type %in% c("human", "pk")) %>%
  left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
  filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
  pivot_longer(
    cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
    names_to = "sample",
    values_to = "counts"
  ) %>%
  left_join(samples, by = "sample") %>%
  group_by(gene_id) %>%
  mutate(total = sum(counts)) %>%
  ungroup() %>%
  arrange(species, gene_type, desc(total)) %>%
  mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
  ggplot(aes(x = gene_name, y = counts, fill = sample_short)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(fill = "Sample") +
  ylab(label = "TPM") +
  xlab(label = "Gene name") +
  facet_wrap(~kit_name) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale()))

Dot plot of log(TPM+1) for rRNA genes per condition

Log TPM is often plotted instead of raw values, but can be misleading on barplot…

gene_annotation %>%
  filter(!gene_type %in% c("human", "pk")) %>%
  left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
  filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
  pivot_longer(
    cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
    names_to = "sample",
    values_to = "counts"
  ) %>%
  left_join(samples, by = "sample") %>%
  group_by(gene_id) %>%
  mutate(total = sum(counts)) %>%
  ungroup() %>%
  arrange(species, gene_type, desc(total)) %>%
  mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
  ggplot(aes(x = gene_name, y = counts)) +
  geom_point(aes(colour = sample_name)) +
  scale_colour_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  # labs(colour = "Sample") +
  labs(x = "Gene name", y = "log(TPM+1)", colour = "Sample") +
  facet_wrap(~sample_name) +
  scale_y_continuous(trans = "log1p")

Globin only

gene_annotation %>%
  filter(gene_type == "human_globin") %>%
  left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
  filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
  pivot_longer(
    cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
    names_to = "sample",
    values_to = "counts"
  ) %>%
  left_join(samples, by = "sample") %>%
  group_by(gene_id) %>%
  mutate(total = sum(counts)) %>%
  ungroup() %>%
  arrange(species, gene_type, desc(total)) %>%
  mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
  ggplot(aes(x = gene_name, y = counts)) +
  geom_point(aes(colour = sample)) +
  scale_colour_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(x = "Gene name", y = "TPM", colour = "Sample") +
  facet_wrap(~sample_name) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale()))

Human rRNA only

gene_annotation %>%
  filter(gene_type == "human_rRNA") %>%
  left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
  filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
  pivot_longer(
    cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
    names_to = "sample",
    values_to = "counts"
  ) %>%
  left_join(samples, by = "sample") %>%
  group_by(gene_id) %>%
  mutate(total = sum(counts)) %>%
  ungroup() %>%
  arrange(species, gene_type, desc(total)) %>%
  mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
  ggplot(aes(x = gene_name, y = counts)) +
  geom_point(aes(colour = sample)) +
  scale_colour_viridis(option = "viridis", discrete = T) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  labs(x = "Gene name", y = "TPM", colour = "Sample") +
  facet_wrap(~sample_name) +
  scale_y_continuous(trans = "log1p")

Plot percentage of reads for each gene type (rRNA/globin) and species

TPM values appear to be the most appropriate metric to use.

Gene-level counts

These gene counts were aggregated across all samples (tximport of transcript counts) and filtered to only genes with a minimum count of 10 reads for at least 1 sample.

gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Sum of gene counts", fill = "RNA type")

Gene-level TPM

Filtered to only genes with a minimum count of 10 reads for at least 1 sample.

gene_tpm_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Sum of TPM counts", fill = "RNA type")

All gene biotypes:

gene_tpm_filtered_long %>%
  left_join(samples, by = "sample") %>%
  mutate(gene_type = case_when(species == "pk" ~ gene_biotype, TRUE ~ gene_type)) %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  # scale_fill_viridis(
    # option = "viridis", discrete = T,
    # limits = rev,
    # labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  # ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Sum of TPM counts", fill = "RNA type")

Plot percentage of reads for each species

Gene-level counts

These gene counts were aggregated across all samples (tximport of transcript counts) and filtered to only genes with a minimum count of 10 reads for at least 1 sample.

# absolute numbers
gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, species) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Sum of gene counts", fill = "Species")

# proportion
gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, species) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
  geom_bar(position = "fill", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Proportion of gene counts", fill = "Species")

Gene-level TPM

Note that since TPM is already scaled by library size, the absolute and proportion plots are identical.

# absolute numbers
gene_tpm_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, species) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Sum of TPM counts", fill = "Species")

# proportion
gene_tpm_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, species) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
  geom_bar(position = "fill", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Proportion of TPM counts", fill = "Species")

Gene recovery

# create binary matrices with presence/absence of genes
gene_tpm_filtered_binary <- gene_tpm_filtered %>%
  mutate(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`, .fns = ~ .x > 0)) %>%
  select(!c(gene_name, gene_type, species))

gene_tpm_pk_filtered_binary <- gene_tpm_filtered %>%
  mutate(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`, .fns = ~ .x > 0)) %>%
  filter(species == "pk") %>%
  select(!c(gene_name, gene_type, species))

gene_tpm_filtered_binary_list <- purrr::set_names(colnames(gene_tpm_filtered_binary %>% select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`))) %>%
  purrr::imap(~ gene_tpm_filtered_binary$gene_id[gene_tpm_filtered_binary[, .x, drop = T]])

gene_tpm_pk_filtered_binary_list <- purrr::set_names(colnames(gene_tpm_pk_filtered_binary %>% select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`))) %>%
  purrr::imap(~ gene_tpm_pk_filtered_binary$gene_id[gene_tpm_pk_filtered_binary[, .x, drop = T]])

comparison_centpip <- samples %>%
  filter(WBC_depletion == "centpip") %>%
  select(sample)

comparison_plasmo <- samples %>%
  filter(WBC_depletion == "plasmo") %>%
  select(sample)

comparison_pmacs <- samples %>%
  filter(WBC_depletion == "pmacs") %>%
  select(sample)

comparison_cellulose <- samples %>%
  filter(WBC_depletion == "cellulose") %>%
  select(sample)

comparison_nofilter <- samples %>%
  filter(WBC_depletion == "nofilter") %>%
  filter(!sample == "104974-001-006_S1_L001")

comparison_mrna_qiagen <- samples %>%
  filter(kit == "mRNA_qiagen_FSglob")

comparison_trna_qiagen <- samples %>%
  filter(kit == "tRNA_qiagen_FSglobH")

comparison_mrna_illumina <- samples %>%
  filter(kit == "mRNA_illumina")

draw_venn <- function(set_list, names, title, output) {
  VennDiagram::venn.diagram(
    x = set_list,
    category.names = names,
    # Circles
    lwd = 2,
    # lty = "blank",
    # col = palette.colors(3),
    # fill = palette.colors(3),
    # col = hcl.colors(3, palette = "viridis", alpha = 0.9),
    # fill = hcl.colors(3, palette = "viridis", alpha = 0.4),
    # col = hcl.colors(3, palette = "Set 2", alpha = 0.9),
    # fill = hcl.colors(3, palette = "Set 2", alpha = 0.4),
    col = hcl.colors(3, palette = "viridis", alpha = 0.9),
    fill = hcl.colors(3, palette = "viridis", alpha = 0.4),
    
    # Numbers
    cex = .9,
    fontface = "bold",
    fontfamily = "sans",

    # Set names
    cat.cex = 0.9,
    cat.fontface = "bold",
    cat.default.pos = "outer",
    cat.pos = c(-20, 20, 135),
    cat.dist = c(0.055, 0.055, 0.11),
    cat.fontfamily = "sans",
    rotation = 1,

    # title
    main = title,
    main.cex = 1.1,
    main.fontface = "bold",
    main.fontfamily = "sans",

    # output features
    height = 1200,
    width = 1200,
    resolution = 300,
    compression = "lzw",
    filename = output,
    output = TRUE,
    disable.logging = TRUE
  )
}

All genes - kits per filter

Gene counts were filtered on a minimum (raw) count of 10 for at least 1 sample before checking for presence/absence.

v <- draw_venn(
  gene_tpm_filtered_binary_list[comparison_centpip$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Centpip",
  NULL
  # here("results/figures/venn-centpip.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_filtered_binary_list[comparison_plasmo$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Plasmo",
  NULL
  # here("results/figures/venn-plasmo.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_filtered_binary_list[comparison_pmacs$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "PMACS",
  NULL
  # here("results/figures/venn-pmacs.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_filtered_binary_list[comparison_cellulose$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Cellulose",
  NULL
  # here("results/figures/venn-cellulose.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_filtered_binary_list[comparison_nofilter$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Cellulose",
  NULL
  # here("results/figures/venn-nofilter.png")
)
grid.newpage()
grid.draw(v)

P. knowlesi genes - kits per filter

Gene counts were filtered on a minimum (raw) count of 10 for at least 1 sample before checking for presence/absence.

v <- draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_centpip$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Centpip",
  NULL
  # here("results/figures/venn-centpip_pk.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_plasmo$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Plasmo",
  NULL
  # here("results/figures/venn-plasmo_pk.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_pmacs$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "PMACS",
  NULL
  # here("results/figures/venn-pmacs_pk.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_cellulose$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "Cellulose",
  NULL
  # here("results/figures/venn-cellulose_pk.png")
)
grid.newpage()
grid.draw(v)

v <- draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_nofilter$sample],
  c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  "No filter",
  NULL
  # here("results/figures/venn-nofilter_pk.png")
)
grid.newpage()
grid.draw(v)

All genes - filters per kit

# Use upset plots instead of five-way venn diagram (alternatively check out proportional area venn)

# mRNA Qiagen
UpSetR::upset(
  as.data.frame(gene_tpm_filtered_binary %>%
    select((comparison_mrna_qiagen %>% select(sample) %>% pull())) %>%
    rename(all_of(c(
      "no filter" = "104974-001-001_S1_L001",
      "cent/pip" = "104974-001-002_S1_L001",
      "plasmodipur" = "104974-001-003_S1_L001",
      "PMACS" = "104974-001-004_S1_L001",
      "cellulose" = "104974-001-005_S1_L001"
    ))) %>%
    mutate_all(as.integer)),
  keep.order = TRUE,
  sets.x.label = "Genes per filter",
  mainbar.y.label = "Intersections for mRNA Qiagen",
  order.by = "freq"
)

# tRNA Qiagen
UpSetR::upset(
  as.data.frame(gene_tpm_filtered_binary %>%
    select((comparison_trna_qiagen %>% select(sample) %>% pull())) %>%
    rename(all_of(c(
      "no filter" = "104974-001-007_S1_L001",
      "cent/pip" = "104974-001-008_S1_L001",
      "plasmodipur" = "104974-001-009_S1_L001",
      "PMACS" = "104974-001-010_S1_L001",
      "cellulose" = "104974-001-011_S1_L001"
    ))) %>%
    mutate_all(as.integer)),
  keep.order = TRUE,
  sets.x.label = "Genes per filter",
  mainbar.y.label = "Intersections for tRNA Qiagen",
  order.by = "freq"
)

# mRNA Illumina
UpSetR::upset(
  as.data.frame(gene_tpm_filtered_binary %>%
    select((comparison_mrna_illumina %>% select(sample) %>% pull())) %>%
    rename(all_of(c(
      "no filter" = "104974-001-012_S1_L001",
      "cent/pip" = "104974-001-013_S1_L001",
      "plasmodipur" = "104974-001-014_S1_L001",
      "PMACS" = "104974-001-015_S1_L001",
      "cellulose" = "104974-001-016_S1_L001"
    ))) %>%
    mutate_all(as.integer)),
  keep.order = TRUE,
  sets.x.label = "Genes per filter",
  mainbar.y.label = "Intersections for mRNA Illumina",
  order.by = "freq"
)

Pk genes - filters per kit

# mRNA Qiagen
UpSetR::upset(
  as.data.frame(gene_tpm_pk_filtered_binary %>%
    select((comparison_mrna_qiagen %>% select(sample) %>% pull())) %>%
    rename(all_of(c(
      "no filter" = "104974-001-001_S1_L001",
      "cent/pip" = "104974-001-002_S1_L001",
      "plasmodipur" = "104974-001-003_S1_L001",
      "PMACS" = "104974-001-004_S1_L001",
      "cellulose" = "104974-001-005_S1_L001"
    ))) %>%
    mutate_all(as.integer)),
  keep.order = TRUE,
  sets.x.label = "Genes per filter",
  mainbar.y.label = "Intersections for mRNA Qiagen",
  order.by = "freq"
)

# tRNA Qiagen
UpSetR::upset(
  as.data.frame(gene_tpm_pk_filtered_binary %>%
    select((comparison_trna_qiagen %>% select(sample) %>% pull())) %>%
    rename(all_of(c(
      "no filter" = "104974-001-007_S1_L001",
      "cent/pip" = "104974-001-008_S1_L001",
      "plasmodipur" = "104974-001-009_S1_L001",
      "PMACS" = "104974-001-010_S1_L001",
      "cellulose" = "104974-001-011_S1_L001"
    ))) %>%
    mutate_all(as.integer)),
  keep.order = TRUE,
  sets.x.label = "Genes per filter",
  mainbar.y.label = "Intersections for tRNA Qiagen",
  order.by = "freq"
)

# mRNA Illumina
UpSetR::upset(
  as.data.frame(gene_tpm_pk_filtered_binary %>%
    select((comparison_mrna_illumina %>% select(sample) %>% pull())) %>%
    rename(all_of(c(
      "no filter" = "104974-001-012_S1_L001",
      "cent/pip" = "104974-001-013_S1_L001",
      "plasmodipur" = "104974-001-014_S1_L001",
      "PMACS" = "104974-001-015_S1_L001",
      "cellulose" = "104974-001-016_S1_L001"
    ))) %>%
    mutate_all(as.integer)),
  keep.order = TRUE,
  sets.x.label = "Genes per filter",
  mainbar.y.label = "Intersections for mRNA Illumina",
  order.by = "freq"
)

Sample-to-sample distances

Which samples are most similar to each other in terms of gene expression profile? Use Euclidean distance between samples on rlog transformed gene counts (filtered to genes with a minimum of 10 reads per sample).

heatmap_of_distances <- function(transformed_data, title) {
  
  sampleDists <- dist(t(assay(transformed_data)))
  sampleDistMatrix <- as.matrix(sampleDists)
  rownames(sampleDistMatrix) <- transformed_data$sample_short
  colnames(sampleDistMatrix) <- NULL
  
    
  # add grouping columns
  anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
    mutate(kit_name = as.factor(kit_name)) %>%
    mutate(filter_name = as.factor(filter_name)) %>%
    rename_with(~ str_to_title(str_replace(.x, "_", " ")))
  
  # set names of samples
  colnames(sampleDistMatrix) <- (transformed_data)$sample_short
  rownames(anno) <- (transformed_data)$sample_short
  
  # Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
  mat_colors <- list(
    `Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
    `Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
  )
  names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
  names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)
  
  # create heatmap
  p <- pheatmap(sampleDistMatrix,
    annotation_col = anno,
    annotation_colors = mat_colors,
    clustering_distance_rows = sampleDists,
    clustering_distance_cols = sampleDists,
    color = viridis(
      n = 256, alpha = 1,
      begin = 0, end = 1, option = "viridis"
    ),
    # main = "Sample-to-sample Euclidian distance based on all genes",
    main = title,
    scale = "none",
    show_rownames = TRUE,
    show_colnames=FALSE
  ) +
    theme(plot.title = element_markdown())
  
  return(p)
}

All genes (rlog)

heatmap_of_distances(rld_filtered, str_wrap("Sample-to-sample Euclidian distance based on all genes - rlog", 40))

## NULL

All genes (vst)

The VST transform leads to slightly different results: the two large blocks remains the same, but the positioning of samples 5, 11 and 16 differs. Overall, since the samples are so similar overall, slight deviations in the clustering of the distance matrix are not too surprising.

heatmap_of_distances(vsd_filtered, str_wrap("Sample-to-sample Euclidian distance based on all genes - vst", 40))

## NULL

P. knowlesi only (rlog)

heatmap_of_distances(rld_filtered[rowData(rld_filtered)$species == "pk", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* genes - rlog before subset", 40))

## NULL

P. knowlesi only (vst)

heatmap_of_distances(vsd_filtered[rowData(vsd_filtered)$species == "pk", ], str_wrap("Sample-to-sample Euclidian distance based on P. knowlesi genes - vst before subset", 40))

## NULL

P. knowlesi only (rlog) - transform after subset

heatmap_of_distances(rld_pk_filtered, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* genes - rlog after subset", 40))

## NULL

P. knowlesi only (vst) - transform after subset

heatmap_of_distances(vsd_pk_filtered, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* genes - vst after subset", 40))

## NULL

P. knowlesi protein-coding only (rlog)

heatmap_of_distances(rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi** protein-coding genes", 40))

## NULL

P. knowlesi protein-coding only (vst)

heatmap_of_distances(vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes", 40))

## NULL

P. knowlesi protein-coding only (rlog) - transform after subset

heatmap_of_distances(rld_pk_filtered_coding, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes - rlog after subset", 40))

## NULL

P. knowlesi protein-coding only (vst) - transform after subset

heatmap_of_distances(vsd_pk_filtered_coding, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes - vst after subset", 40))

## NULL

Heatmap of count matrix

heatmap_of_counts <- function(raw_data, transformed_data, top_type = "mean", top_n = 20, center = TRUE) {
  # select n most highly expressed/variable genes
  if (top_type == "mean") {
    select <- order(rowMeans(counts(raw_data, normalized = FALSE)),
                    decreasing = TRUE
                    )[1:top_n]
  } else if (top_type == "median") {
    select <- order(rowMedians(counts(raw_data, normalized = FALSE)),
                    decreasing = TRUE
                    )[1:top_n]
  } else if (top_type == "var") {
    select <- head(order(rowVars(assay(transformed_data)), decreasing = TRUE), top_n)
  }
  
  # select transformed data and center if requested
  mat <- assay(transformed_data)[select, ]
  
  if (center == TRUE){
    mat <- mat - rowMeans(mat)
  }
  
  # add grouping columns
  anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
  mutate(kit_name = as.factor(kit_name)) %>%
  mutate(filter_name = as.factor(filter_name)) %>%
  rename_with(~ str_to_title(str_replace(.x, "_", " ")))
  
  # set names of samples
  colnames(mat) <- (transformed_data)$sample_short
  rownames(anno) <- (transformed_data)$sample_short
  
  # Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
  mat_colors <- list(
    `Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
    `Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
  )
  names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
  names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)

  # create heatmap
  pheatmap(mat,
    annotation_col = anno,
    annotation_colors = mat_colors,
    color = viridis(n = 256, alpha = 1, begin = 0, end = 1, option = "viridis"),
    # cluster_rows = FALSE,
    # cluster_cols = FALSE,
    show_rownames = FALSE,
  )
  
  # create table of genes
  gene_annotation %>% filter(gene_id %in% rownames(mat))
}

All genes - vst - top expressed (mean)

This plot clusters the top n=20 most highly expressed genes (mean) centred on the gene’s mean expression level across all samples using the VST data.

Several of the top expressed genes are globin and rRNA genes (and most reads are human).

heatmap_of_counts(gse_filtered, vsd_filtered, "mean", 20, TRUE)

All genes - rlog - top expressed (mean)

This plot cluster the top n=20 most highly expressed genes (mean) centred on the gene’s mean expression level across all samples using the rlog transformed data.

Several of the top expressed genes are globin and rRNA genes (and most reads are human).

heatmap_of_counts(gse_filtered, rld_filtered, "mean", 20, TRUE)

All genes - vst - top expressed (median)

This plot clusters the top n=20 most highly expressed genes (median) centred on the gene’s mean expression level across all samples using the VST data.

Several of the top expressed genes are globin and rRNA genes (and most reads are human).

heatmap_of_counts(gse_filtered, vsd_filtered, "median", 20, TRUE)

All genes - rlog - top expressed (median)

This plot cluster the top n=20 most highly expressed genes (median) centred on the gene’s mean expression level across all samples using the rlog transformed data.

Several of the top expressed genes are globin and rRNA genes (and most reads are human).

heatmap_of_counts(gse_filtered, rld_filtered, "median", 20, TRUE)

All genes - vst - top variable

This plot clusters the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the VST data.

Several of the top variable genes are human/Pk rRNA genes (and most reads are human).

heatmap_of_counts(gse_filtered, vsd_filtered, "var", 20, TRUE)

All genes - rlog - top variable

This plot cluster the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the rlog transformed data.

Several of the top variable genes are human genes (but globin and rRNA are rare).

heatmap_of_counts(gse_filtered, rld_filtered, "var", 20, TRUE)

Pk genes protein coding - vst - top expressed (mean)

This plot clusters the top n=20 most highly expressed genes centred on the gene’s mean expression level across all samples using the VST data.

The choice of transformation only has a minor effect on this subset of top-expressed genes.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
                  vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding"], 
                  "mean", 20, TRUE)

Pk genes protein coding - rlog - top expressed (mean)

This plot cluster the top n=20 most highly expressed genes centred on the gene’s mean expression level across all samples using the rlog transformed data.

The choice of transformation only has a minor effect on this subset of top-expressed genes.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
                  rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"], 
                  "mean", 20, TRUE)

Pk genes protein coding - vst - top variable

This plot clusters the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the VST data.

The choice of transformation only has a minor effect on this subset of top-variable genes.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
                  vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding"], 
                  "var", 20, TRUE)

Pk genes protein coding - rlog - top variable

This plot cluster the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the rlog transformed data.

The choice of transformation only has a minor effect on this subset of top-variable genes.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
                  rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"], 
                  "var", 20, TRUE)

Pk genes protein coding - vst - top expressed genes (mean) - transform after subsetting

This plot cluster the top n=20 most highly expressed Pk protein coding genes (centred on the gene’s mean expression level) across all samples using the vst transformed data, without clustering. The data was transformed after subsetting.

heatmap_of_counts(gse_pk_filtered_coding,
                  vsd_pk_filtered_coding, 
                  "mean", 20, TRUE)

Pk genes protein coding - rlog - top expressed genes (mean) - transform after subsetting

This plot cluster the top n=20 most highly expressed Pk protein coding genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.

heatmap_of_counts(gse_pk_filtered_coding,
                  rld_pk_filtered_coding, 
                  "mean", 20, TRUE)

Pk genes protein coding - vst - top variable - transform after subsetting

This plot cluster the top n=20 most highly variable Pk protein coding genes across all samples using the vst transformed data, without clustering. The data was transformed after subsetting.

heatmap_of_counts(gse_pk_filtered_coding,
                  vsd_pk_filtered_coding, 
                  "var", 20, TRUE)

Pk genes protein coding - rlog - top variable - transform after subsetting

This plot cluster the top n=20 most highly variable Pk protein coding genes across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.

heatmap_of_counts(gse_pk_filtered_coding,
                  rld_pk_filtered_coding, 
                  "var", 20, TRUE)

Pk genes - vst - top expressed genes

This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering.

Several of the top expressed genes in the previous plot are rRNA genes, but not all of them.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
                  vsd_filtered[rowData(gse_filtered)$species == "pk"],
                  "mean", 20, TRUE)

Pk genes - rlog - top expressed genes

This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
                  vsd_filtered[rowData(gse_filtered)$species == "pk"], 
                  "mean", 20, TRUE)

Pk genes - vst - top variable

This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
                  vsd_filtered[rowData(gse_filtered)$species == "pk"],
                  "var", 20, TRUE)

Pk genes - rlog - top variable

This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering.

heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
                  vsd_filtered[rowData(gse_filtered)$species == "pk"], 
                  "var", 20, TRUE)

Pk genes - vst - top expressed genes - transform after subsetting

This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.

Several of the top expressed genes in the previous plot are rRNA genes, but not all of them.

heatmap_of_counts(gse_pk_filtered,
                  vsd_pk_filtered,
                  "mean", 20, TRUE)

Pk genes - rlog - top expressed genes - transform after subsetting

This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.

Several of the top expressed genes in the previous plot are rRNA genes, but not all of them.

heatmap_of_counts(gse_pk_filtered,
                  rld_pk_filtered, 
                  "mean", 20, TRUE)

Pk genes - vst - top variable - transform after subsetting

This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.

heatmap_of_counts(gse_pk_filtered,
                  vsd_pk_filtered,
                  "var", 20, TRUE)

Pk genes - rlog - top variable - transform after subsetting

This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.

heatmap_of_counts(gse_pk_filtered,
                  rld_pk_filtered, 
                  "var", 20, TRUE)

Expression count correlation plots

Protein-coding Pk genes - TPM

The following figure plots the (log-transformed) TPM of each protein-coding P. knowlesi gene (filtered on a minimum count of 10 for at least one sample) for all pairwise combinations of samples.

mat <- round(
          cor(
            gene_tpm_filtered %>%
              filter(species == "pk") %>%
              filter(gene_biotype == "protein_coding") %>% 
              select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`) %>% 
              rename_with(~samples %>% pull(sample_name) %>% as.character(), everything())
            ),
          1)
corr <- mat

ggcorrplot(corr,
           type = "lower",
           method = "circle",
           hc.order = FALSE,
           outline.color = "white",
           p.mat = cor_pmat(mat),
           lab = TRUE,
           legend.title = "Pearson correlation of TPM values") +
  scale_fill_gradientn(colors = viridis(256, option = 'D'), limits = c(-1,1),
                       name = "Pearson correlation")
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

correlation_plot <- ggpairs(
  gene_tpm_filtered %>%
    filter(species == "pk") %>%
    filter(gene_biotype == "protein_coding"),
  columns = 3:18,
  columnLabels = samples$sample_short,
  upper = list(continuous = wrap("cor", size = 5)),
) +
  theme(
    axis.text = element_text(size = 9),
    axis.title = element_text(size = 9),
    strip.text.x = element_text(size = 11),
    strip.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 9)
    )
  # scale_y_discrete()

correlation_plot

correlation_plot_log <- ggpairs(
  gene_tpm_filtered %>%
    filter(species == "pk") %>%
    filter(gene_biotype == "protein_coding"),
  columns = 3:18, # `104974-001-001_S1_L001`:`104974-001-002_S1_L001`,
  lower = list(continuous = "smooth"),
  upper = list(continuous = wrap("cor", size = 5)),
  columnLabels = samples$sample_short
) +
  theme(
    axis.text = element_text(size = 9),
    axis.title = element_text(size = 9),
    strip.text.x = element_text(size = 11),
    strip.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 9)
    ) +
  scale_x_continuous(trans = "log1p") +
  scale_y_continuous(trans = "log1p")

correlation_plot_log

PCA plots

All PCA plots use the top 500 most variable genes and are filtered to only genes with a minimum count of 10 reads for at least one sample, unless otherwise stated.

All genes (rlog)

pcaData <- plotPCA(rld_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog-transformed counts of human and ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Filter method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

All genes (vst)

pcaData <- plotPCA(vsd_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of VST counts of human and ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Filter method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes (rlog)

pcaData <- plotPCA(rld_filtered[rowData(rld_filtered)$species == "pk", ], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes (vst)

pcaData <- plotPCA(vsd_filtered[rowData(vsd_filtered)$species == "pk", ], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes protein-coding (rlog)

pcaData <- plotPCA(rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes protein-coding (vst)

pcaData <- plotPCA(vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding"], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes (rlog) - transform after subset

pcaData <- plotPCA(rld_pk_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes (vst) - transform after subset

pcaData <- plotPCA(vsd_pk_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes protein-coding (rlog)

protein_gene_filter <- intersect(rownames(rowData(rld_filtered)), gene_annotation %>% filter(gene_biotype == "protein_coding" & species == "pk") %>% select(gene_id) %>% pull())
pcaData <- plotPCA(rld_filtered[protein_gene_filter, ], intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes protein-coding (vst)

protein_gene_filter <- intersect(rownames(rowData(vsd_filtered)), gene_annotation %>% filter(gene_biotype == "protein_coding" & species == "pk") %>% select(gene_id) %>% pull())
pcaData <- plotPCA(vsd_filtered[protein_gene_filter, ], intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes protein-coding (rlog) - transform after subset

pcaData <- plotPCA(rld_pk_filtered_coding, intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog counts of ", italic("P. knowlesi"), " protein coding genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Pk genes protein-coding (vst) - transform after subset

pcaData <- plotPCA(vsd_pk_filtered_coding, intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " protein coding genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

Figures for publication

Sum of gene counts

Ordered by library - depletion - filter

samples_order <- samples %>% 
  arrange(factor(samples$kit, levels = c("mRNA_illumina", "mRNA_qiagen_FSglobH", "mRNA_qiagen_FSglob", "tRNA_qiagen_FSglobH")),
          factor(samples$WBC_depletion, levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose"))) %>% 
  select(sample_name)

p2_a <- gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, species) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>% 
  # arrange(factor(sample_name, levels = samples_order %>% pull())) %>% 
  mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>% 
  ggplot(aes(x = sum_reads, y = sample_name, fill = species)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    # labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  scale_y_discrete(limits = rev) +
  scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(y = "Sample", x = "Sum of all gene counts", fill = "RNA type") +
  theme_classic() +
  ggtitle("A")

# print p2a to show y labels
p2_a

p2_a <- p2_a + theme(axis.title.y = element_blank(), axis.text.y = element_blank())

p2_b <- gene_tpm_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>% 
  ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_manual(
    values = c(`human` = "#440154FF", 
               `human_globin` = "#3B528BFF", 
               `human_rRNA` = "#21908CFF",
               `pk_rRNA` = "#5DC863FF",
               `pk` = "#FDE725FF"),
      labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")),
                 expression(italic("P. knowlesi rRNA")))
    ) +
  # scale_fill_viridis(
  #   option = "viridis", discrete = T,
  #   # limits = rev,
  #   labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")),
  #              expression(italic("P. knowlesi rRNA")))
  # ) +
  scale_y_discrete(limits = rev) +
  scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(y = "Sample", x = "Sum of TPM counts", fill = "RNA type") +
  theme_classic() +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
  ggtitle("B")

p2_c <- gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  filter(species == "pk" & gene_biotype == "protein_coding") %>%
  group_by(sample_name,) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>% 
  ggplot(aes(x = sum_reads, y = sample_name)) +
  geom_bar(position = "stack", stat = "identity", fill = "#FDE725FF") +
  # scale_fill_manual(viridis::viridis(5)[4:5]) +
  # scale_fill_manual(
  #   values = c(pk = "#5DC863FF", pk_rRNA = "#FDE725FF"),
  #   # scale_fill_viridis(
  #   #   option = "viridis", discrete = T,
  #   #   # limits = rev,
  #   labels = c(expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  # ) +
  scale_y_discrete(limits = rev) +
  scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(y = "Sample", x = str_wrap("Sum of *P. knowlesi* protein-coding gene counts", 10), fill = "RNA type") +
  theme_classic() +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.title.x = ggtext::element_markdown()) +
  ggtitle("C")

fig <- ggarrange(
  p2_a, p2_b, p2_c,
  ncol = 3,
  legend = "bottom",
  common.legend = TRUE,
  legend.grob = get_legend(p2_b)
  # widths = c(2,1,2)
)
fig

ggsave(filename = here("results/figures/gene_counts.svg"), 
      plot = fig, width=10, height=6)

Ordered by sample number (subtypes)

p2_a <- gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Sum of gene counts", fill = "RNA type") +
  theme_classic() +
  ggtitle("A")

# print p2a to show y labels
p2_a

p2_a <- p2_a + theme(axis.title.y = element_blank(), axis.text.y = element_blank())


p2_b <- gene_tpm_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = "Sum of TPM counts", fill = "RNA type") +
  theme_classic() +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
  ggtitle("B")

p2_c <- gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  filter(species == "pk") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  # scale_fill_manual(viridis::viridis(5)[4:5]) +
  scale_fill_manual(
    values = c(pk = "#5DC863FF", pk_rRNA = "#FDE725FF"),
    # scale_fill_viridis(
    #   option = "viridis", discrete = T,
    #   # limits = rev,
    labels = c(expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(x = "Sample", y = expression(paste("Sum of ", italic("P. knowlesi"), " gene counts")), fill = "RNA type") +
  theme_classic() +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
  ggtitle("C")

fig <- ggarrange(
  p2_a, p2_b, p2_c,
  ncol = 3,
  common.legend = TRUE, legend = "bottom"
)
fig

Ordered by library - depletion - filter (subtypes)

samples_order <- samples %>% 
  arrange(factor(samples$kit, levels = c("mRNA_illumina", "mRNA_qiagen_FSglobH", "mRNA_qiagen_FSglob", "tRNA_qiagen_FSglobH")),
          factor(samples$WBC_depletion, levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose"))) %>% 
  select(sample_name)

p2_a <- gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>% 
  # arrange(factor(sample_name, levels = samples_order %>% pull())) %>% 
  mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>% 
  ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  scale_y_discrete(limits = rev) +
  scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(y = "Sample", x = "Sum of gene counts", fill = "RNA type") +
  theme_classic() +
  ggtitle("A")

# print p2a to show y labels
p2_a

p2_a <- p2_a + theme(axis.title.y = element_blank(), axis.text.y = element_blank())

p2_b <- gene_tpm_filtered_long %>%
  left_join(samples, by = "sample") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>% 
  ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_viridis(
    option = "viridis", discrete = T,
    # limits = rev,
    labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  scale_y_discrete(limits = rev) +
  scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(y = "Sample", x = "Sum of TPM counts", fill = "RNA type") +
  theme_classic() +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
  ggtitle("B")

p2_c <- gene_counts_txi_filtered_long %>%
  left_join(samples, by = "sample") %>%
  filter(species == "pk") %>%
  group_by(sample_name, gene_type) %>%
  summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
  mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
  mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>% 
  ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
  geom_bar(position = "stack", stat = "identity") +
  # scale_fill_manual(viridis::viridis(5)[4:5]) +
  scale_fill_manual(
    values = c(pk = "#5DC863FF", pk_rRNA = "#FDE725FF"),
    # scale_fill_viridis(
    #   option = "viridis", discrete = T,
    #   # limits = rev,
    labels = c(expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
  ) +
  scale_y_discrete(limits = rev) +
  scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
  labs(y = "Sample", x = expression(paste("Sum of ", italic("P. knowlesi"), " gene counts")), fill = "RNA type") +
  theme_classic() +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
  ggtitle("C")

fig <- ggarrange(
  p2_a, p2_b, p2_c,
  ncol = 3,
  common.legend = TRUE, legend = "bottom"
)
fig

Heatmaps of distances

heatmap_of_distances <- function(transformed_data, title) {
  
  sampleDists <- dist(t(assay(transformed_data)))
  sampleDistMatrix <- as.matrix(sampleDists)
  rownames(sampleDistMatrix) <- transformed_data$sample_short
  colnames(sampleDistMatrix) <- NULL
  
    
  # add grouping columns
  anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
    mutate(kit_name = as.factor(kit_name)) %>%
    mutate(filter_name = as.factor(filter_name)) %>%
    rename_with(~ str_to_title(str_replace(.x, "_", " ")))
  
  # set names of samples
  colnames(sampleDistMatrix) <- (transformed_data)$sample_short
  rownames(anno) <- (transformed_data)$sample_short
  
  # Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
  mat_colors <- list(
    `Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
    `Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
  )
  names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
  names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)
  
  # create heatmap
  p <- pheatmap(sampleDistMatrix,
    annotation_col = anno,
    annotation_colors = mat_colors,
    clustering_distance_rows = sampleDists,
    clustering_distance_cols = sampleDists,
    color = viridis(
      n = 256, alpha = 1,
      begin = 0, end = 1, option = "viridis"
    ),
    # main = "Sample-to-sample Euclidian distance based on all genes",
    # main = title,
    scale = "none",
    show_rownames = TRUE,
    show_colnames=FALSE
  ) 
  
  return(p)
}

Pk protein coding genes (rlog before subset)

p <- heatmap_of_distances(rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi** protein-coding genes", 40))
ggsave(filename = here("results/figures/genes_pk_coding_sample_dist_rlog.svg"), 
      plot = p)
p

Pk protein coding genes (rlog after subset)

p <- heatmap_of_distances(rld_pk_filtered_coding, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes - rlog after subset", 40))
ggsave(filename = here("results/figures/genes_pk_coding_sample_dist_rlog_transform_after_subset.svg"), 
      plot = p)
p

Heatmaps of count matrix

heatmap_of_counts <- function(raw_data, transformed_data, top_type = "mean", top_n = 20, center = TRUE) {
  # select n most highly expressed/variable genes
  if (top_type == "mean") {
    select <- order(rowMeans(counts(raw_data, normalized = FALSE)),
                    decreasing = TRUE
                    )[1:top_n]
  } else if (top_type == "median") {
    select <- order(rowMedians(counts(raw_data, normalized = FALSE)),
                    decreasing = TRUE
                    )[1:top_n]
  } else if (top_type == "var") {
    select <- head(order(rowVars(assay(transformed_data)), decreasing = TRUE), top_n)
  }
  
  # select transformed data and center if requested
  mat <- assay(transformed_data)[select, ]
  
  if (center == TRUE){
    mat <- mat - rowMeans(mat)
  }
  
  # add grouping columns
  anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
  mutate(kit_name = as.factor(kit_name)) %>%
  mutate(filter_name = as.factor(filter_name)) %>%
  rename_with(~ str_to_title(str_replace(.x, "_", " ")))
  
  # set names of samples
  colnames(mat) <- (transformed_data)$sample_short
  rownames(anno) <- (transformed_data)$sample_short
  
  # Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
  mat_colors <- list(
    `Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
    `Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
  )
  names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
  names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)

  # create heatmap
  pheatmap(mat,
    annotation_col = anno,
    annotation_colors = mat_colors,
    color = viridis(n = 256, alpha = 1, begin = 0, end = 1, option = "viridis"),
    # cluster_rows = FALSE,
    # cluster_cols = FALSE,
    show_rownames = FALSE,
  )
}

Pk protein coding genes (top variable, rlog before subset)

p <- heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
                  rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"], 
                  "var", 20, TRUE)
ggsave(filename = here("results/figures/genes_pk_coding_top_var_rlog.svg"), 
      plot = p)
p

Pk protein coding genes (top variable, rlog after subset)

p <- heatmap_of_counts(gse_pk_filtered_coding,
                  rld_pk_filtered_coding, 
                  "var", 20, TRUE)
ggsave(filename = here("results/figures/genes_pk_coding_top_var_rlog_transform_after_subset.svg"), 
      plot = p)
p

Pk all genes (top variable, rlog before subset)

p <- heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
                  vsd_filtered[rowData(gse_filtered)$species == "pk"], 
                  "var", 20, TRUE)
ggsave(filename = here("results/figures/genes_pk_all_top_var_rlog.svg"), 
      plot = p)
p

Pk all genes (top variable, rlog after subset)

p <- heatmap_of_counts(gse_pk_filtered,
                  rld_pk_filtered, 
                  "var", 20, TRUE)
ggsave(filename = here("results/figures/genes_pk_all_top_var_rlog_transform_after_subset.svg"), 
      plot = p)
p

PCA plots

Pk genes protein-coding (rlog) - transform after subset

pcaData <- plotPCA(rld_pk_filtered_coding, intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
percentVar <- round(100 * attr(pcaData, "percentVar"))
p <- ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
  geom_point(size = 4) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  labs(
    title = expression(paste("PCA of rlog counts of ", italic("P. knowlesi"), " protein coding genes")),
    color = "Library preparation kit", shape = "Depletion method"
  ) +
  coord_fixed() +
  scale_color_viridis(discrete = TRUE) +
  theme_bw()

ggsave(filename = here("results/figures/pca_pk_coding_rlog_transform_after_subset.svg"), 
      plot = p)
p

Gene recovery venn diagrams

See previous section for in-line plots; the following code chunk only exports the relevant files.

draw_venn <- function(set_list, names, title, output) {
  VennDiagram::venn.diagram(
    x = set_list,
    category.names = names,
    
    # Circles
    lwd = 2,
    col = hcl.colors(3, palette = "viridis", alpha = 0.9),
    fill = hcl.colors(3, palette = "viridis", alpha = 0.4),

    # Numbers
    cex = 1.4,
    fontface = "bold",
    fontfamily = "sans",

    # Set names
    cat.cex = 1.1,
    cat.fontface = "bold",
    cat.default.pos = "outer",
    cat.pos = c(-15, 15, 125),
    cat.dist = c(0.055, 0.055, 0.09),
    cat.fontfamily = "sans",
    rotation = 1,

    # title
    main = title,
    main.cex = 1.4,
    main.fontface = "bold",
    main.fontfamily = "sans",

    # output features
    # height = 1400,
    # width = 1400,
    units = "px",
    resolution = 600,
    # compression = "lzw",
    filename = output,
    output = TRUE,
    disable.logging = TRUE,
    imagetype = "png"
  )
}

p <- draw_venn(
  gene_tpm_filtered_binary_list[comparison_centpip$sample],
  # c("mRNA Qiagen FS globin", "total RNA Qiagen FS globin/rRNA", "mRNA Illumina"),
  c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
  "Cent/pip",
  here("results/figures/venn-centpip.png")
)

draw_venn(
  gene_tpm_filtered_binary_list[comparison_plasmo$sample],
  c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
  "Plasmo",
  here("results/figures/venn-plasmo.png")
)

draw_venn(
  gene_tpm_filtered_binary_list[comparison_pmacs$sample],
  c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
  "PMACS",
  here("results/figures/venn-pmacs.png")
)

draw_venn(
  gene_tpm_filtered_binary_list[comparison_cellulose$sample],
  c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
  "Cellulose",
  here("results/figures/venn-cellulose.png")
)

draw_venn(
  gene_tpm_filtered_binary_list[comparison_nofilter$sample],
  c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
  "Cellulose",
  here("results/figures/venn-nofilter.png")
)

draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_centpip$sample],
  c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
  "Cent/pip",
  here("results/figures/venn-centpip_pk.png")
)

draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_plasmo$sample],
  c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
  "Plasmo",
  here("results/figures/venn-plasmo_pk.png")
)

draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_pmacs$sample],
  c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
  "PMACS",
  here("results/figures/venn-pmacs_pk.png")
)

draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_cellulose$sample],
  c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
  "Cellulose",
  here("results/figures/venn-cellulose_pk.png")
)

draw_venn(
  gene_tpm_pk_filtered_binary_list[comparison_nofilter$sample],
  c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
  "No filter",
  here("results/figures/venn-nofilter_pk.png")
)

Gene expression count correlation plots

Simple correlation matrix

mat <- round(
          cor(
            gene_tpm_filtered %>%
              filter(species == "pk") %>%
              filter(gene_biotype == "protein_coding") %>% 
              select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`) %>% 
              rename_with(~samples %>% pull(sample_name) %>% as.character(), everything())
            ),
          1)
corr <- mat

p <- ggcorrplot(corr,
           type = "lower",
           method = "circle",
           hc.order = FALSE,
           outline.color = "white",
           p.mat = cor_pmat(mat),
           lab = TRUE,
           legend.title = "Pearson correlation of TPM values") +
  scale_fill_gradientn(colors = viridis(256, option = 'D'), limits = c(-1,1), name = "Pearson correlation")
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
ggsave(filename = here("results/figures/pk_coding_tpm_correlation.svg"), 
      plot = p,
      width = 10, height = 10)
p

Ordered by mRNA - total RNA library preparation kit:

mat <- round(
          cor(
            gene_tpm_filtered %>%
              filter(species == "pk") %>%
              filter(gene_biotype == "protein_coding") %>% 
              select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`) %>% 
              rename_with(~samples %>% pull(sample_name) %>% as.character(), everything()) %>% 
              relocate(samples_order %>% pull())
            ),
          1)
corr <- mat

p <- ggcorrplot(corr,
           type = "lower",
           method = "circle",
           hc.order = FALSE,
           outline.color = "white",
           p.mat = cor_pmat(mat),
           lab = TRUE,
           legend.title = "Pearson correlation of TPM values") +
  scale_fill_gradientn(colors = viridis(256, option = 'D'), limits = c(-1,1), name = "Pearson correlation")
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
ggsave(filename = here("results/figures/pk_coding_tpm_correlation_ordered.svg"), 
      plot = p,
      width = 10, height = 10)
p

Correlation pairs plots

Pk protein coding genes only:

correlation_plot

ggsave(filename = here("results/figures/correlation_protein_coding_genes_pk.png"), 
      plot = correlation_plot,
      height = 4000, width = 6000, units = "px")
ggsave(filename = here("results/figures/correlation_protein_coding_genes_pk.svg"),
      plot = correlation_plot,
      height = 4000, width = 6000, units = "px")

# order by tRNA - mRNA

samples_order <- samples %>% 
  arrange(factor(samples$kit,
                 levels = c("mRNA_illumina", "mRNA_qiagen_FSglobH", "mRNA_qiagen_FSglob", "tRNA_qiagen_FSglobH")),
          factor(samples$WBC_depletion,
                 levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose")))

correlation_plot_ordered <- ggpairs(
  gene_tpm_filtered %>%
    filter(species == "pk") %>%
    filter(gene_biotype == "protein_coding") %>% 
    relocate(
      samples_order %>% select(sample) %>% pull()
      ),
  columns = 1:16,
  # columnLabels = samples_order %>% select(sample_short) %>% pull(),
  columnLabels = samples_order %>% select(sample_short) %>% mutate(sample_short = factor(sample_short, levels = sample_short)) %>% pull(),
  upper = list(continuous = wrap("cor", size = 5)),
) +
  theme(
    axis.text = element_text(size = 9),
    axis.title = element_text(size = 9),
    strip.text.x = element_text(size = 11),
    strip.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 9)
    )
  # scale_y_discrete()

correlation_plot_ordered

ggsave(filename = here("results/figures/correlation_protein_coding_genes_pk_ordered.png"), 
      plot = correlation_plot_ordered,
      height = 4000, width = 6000, units = "px")
ggsave(filename = here("results/figures/correlation_protein_coding_genes_pk_ordered.svg"),
      plot = correlation_plot_ordered,
      height = 4000, width = 6000, units = "px")

All Pk genes:

correlation_plot_log

ggsave(here("results/figures/correlation_protein_coding_genes_logscaled.png"), 
       plot = correlation_plot_log,
       height = 4000, width = 6000, units = "px")
ggsave(here("results/figures/correlation_protein_coding_genes_logscaled.svg"), 
       plot = correlation_plot_log,
       height = 4000, width = 6000, units = "px")

# order by tRNA-mRNA

correlation_plot_log_ordered <- ggpairs(
  gene_tpm_filtered %>%
    filter(species == "pk") %>%
    filter(gene_biotype == "protein_coding") %>% 
    relocate(
      samples_order %>% select(sample) %>% pull()
    ),
  columns = 1:16, # `104974-001-001_S1_L001`:`104974-001-002_S1_L001`,
  lower = list(continuous = "smooth"),
  upper = list(continuous = wrap("cor", size = 5)),
  columnLabels = samples_order %>% select(sample_short) %>% mutate(sample_short = factor(sample_short, levels = sample_short)) %>% pull(),,
) +
  theme(
    axis.text = element_text(size = 9),
    axis.title = element_text(size = 9),
    strip.text.x = element_text(size = 11),
    strip.text.y = element_text(size = 10),
    axis.text.x = element_text(size = 6),
    axis.text.y = element_text(size = 9)
    ) +
  scale_x_continuous(trans = "log1p") +
  scale_y_continuous(trans = "log1p")

correlation_plot_log

ggsave(here("results/figures/correlation_protein_coding_genes_logscaled_ordered.png"), 
       plot = correlation_plot_log_ordered,
       height = 4000, width = 6000, units = "px")
ggsave(here("results/figures/correlation_protein_coding_genes_logscaled_ordered.svg"), 
       plot = correlation_plot_log_ordered,
       height = 4000, width = 6000, units = "px")